#!/bin/bash
# Start a local xinference server and block until it answers HTTP, so a
# model launch can follow safely.
set -uo pipefail

# Run the xinference server in the background, pulling model weights from
# ModelScope and caching them under /xinference/xinference_cache.
XINFERENCE_MODEL_SRC=modelscope XINFERENCE_HOME=/xinference/xinference_cache \
  xinference-local -H 0.0.0.0 &

# Poll the server's default endpoint (port 9997) once per second until it
# responds. Bound the wait so a server that never comes up fails the script
# instead of hanging forever.
readonly ENDPOINT="http://localhost:9997"
readonly MAX_WAIT_SECS=300
waited=0
until curl -s "$ENDPOINT" > /dev/null; do
  if (( waited >= MAX_WAIT_SECS )); then
    echo "xinference server not ready after ${MAX_WAIT_SECS}s; aborting" >&2
    exit 1
  fi
  sleep 1
  waited=$(( waited + 1 ))
done

# Launch DeepSeek-R1 671B (AWQ) via the vLLM engine across 16 GPUs:
# pipeline parallel 2 x tensor parallel 8, with a 31/30 layer split
# between the two pipeline stages.
# NOTE(review): flag spellings mix --kebab-case and --snake_case; kept
# byte-identical since that is what this xinference CLI version accepts.
launch_args=(
  --model-engine vllm
  --model-name deepseek-r1
  --n-gpu 16
  --size-in-billions 671
  --model-format awq
  --model_path /mnt/nvme1n1/deepseek-r1-awq
  --pipeline_parallel_size 2
  --tensor_parallel_size 8
  --gpu_memory_utilization 0.96
  --max_num_seqs 64
  --max_model_len $(( 1024 * 16 ))  # $(( )) replaces deprecated $[ ] form
)
VLLM_PP_LAYER_PARTITION="31,30" xinference launch "${launch_args[@]}" &
PID1=$!
# Wait for the launch command first (propagating its status into $?), then
# for any remaining background jobs (the xinference-local server) so the
# script stays alive as long as the server runs.
wait "$PID1"
wait


